Load libraries

Overview of somatic mutation using BigQuery

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
bigquery_mutation_stat = read.csv("/Users/qqin/Documents/23Q4_all_samples_stat.csv")

p <- ggboxplot(melt(bigquery_mutation_stat[, c("gnomad_23Q4", "popaf_23q2_standard")], variable.name='release_and_database'), 
               x = "release_and_database", y = "value",
               color = "release_and_database", palette =c("#00AFBB", "#FC4E07"),
               add = "jitter", shape = "release_and_database") + ylab('Frequency')
## No id variables; using all as measure variables
my_comparisons <- list(c('gnomad_23Q4', "popaf_23q2_standard"))

print(p + stat_compare_means(comparisons = my_comparisons, method = "t.test") + ylab("Somatic variants after gnomAD filter") + xlab("")) 

Overview of all the top frequent genes in 23Q4

LoF and GoF printing

oncoplot(test_23Q4_ccle_maf, genes=names(sort(table(test_23Q4_ccle_maf@data[grepl('^Gain', test_23Q4_ccle_maf@data$oncokb_effect), 'Hugo_Symbol']), decreasing=T)[1:20]), top = 20, gene_mar=8)

Hotspot mutation printing

oncoplot(test_23Q4_ccle_maf, genes=names(table(test_23Q4_ccle_maf@data[grepl('^Y', test_23Q4_ccle_maf@data$hess_driver) | (test_23Q4_ccle_maf@data$oncokb_hotspot=='Y'), 'Hugo_Symbol'])[1:20]), top = 20)

Cancer type exploration

After filtering by the internal cohort af \(af < 0.05\).

## Warning in FUN(X[[i]], ...): Removed 0 samples with zero mutations.

## Warning in FUN(X[[i]], ...): Removed 0 samples with zero mutations.

## Warning in FUN(X[[i]], ...): Removed 0 samples with zero mutations.

## Warning: Removed 18 rows containing missing values (`position_stack()`).

## Warning: Removed 54 rows containing missing values (`position_stack()`).

Integrate with the CRISPR dependencies

mike_biomarkers = read.csv("~/Downloads/bench_corrs_for_mike_new (1).csv")
mike_biomarkers_mutation = subset(mike_biomarkers, feat_type!='CN' & feat_type!='GE')

Integrate mike’s biomarkers

mike_biomarkers_mutation_nona = mike_biomarkers_mutation[!is.na(mike_biomarkers_mutation$cor),]
mike_biomarkers_mutation_nona = subset(mike_biomarkers_mutation_nona, model=='CERES_CRISPR')

res = list()
for (row_index in 1:nrow(mike_biomarkers_mutation_nona)) {
  feat = mike_biomarkers_mutation_nona[row_index, 'Feat_gene']
  dep = mike_biomarkers_mutation_nona[row_index, 'Dep_Gene']
  res[[paste0(feat, "_", dep)]] = tryCatch({
    model=plot_dependency(feat, dep, any_mutation=T, plot=F)
    c(model[1], model[2], model[3], model[4])
  }, error=function(e) c(NA, NA, NA, NA))
}
## [1] "ARID1B..57492."
## NULL
## 
## ARID1A    Non 
##    169   1036 
##        CRISPR    LoF
## 1  0.02086464 ARID1A
## 2 -0.77398300 ARID1A
## 3 -0.06621979 ARID1A
## 4 -1.80534223 ARID1A
## 5 -0.61172793 ARID1A
## 6 -0.47491534 ARID1A
## [1] "BRAF..673."
## NULL
## 
## BRAF  Non 
##  127 1078 
##        CRISPR  LoF
## 1 -0.09305552 BRAF
## 2 -1.81956333 BRAF
## 3 -0.73334082 BRAF
## 4 -1.32623136 BRAF
## 5 -0.79960230 BRAF
## 6 -1.31084344 BRAF
## [1] "EGF..1950."
## NULL
## 
## EGFR  Non 
##   73 1132 
##         CRISPR  LoF
## 1  0.139012119 EGFR
## 2 -0.007850668 EGFR
## 3  0.071310771 EGFR
## 4 -0.070910030 EGFR
## 5 -0.063671598 EGFR
## 6  0.131968428 EGFR
## [1] "KRAS..3845."
## NULL
## 
## KRAS  Non 
##  197 1008 
##      CRISPR  LoF
## 1 -2.475868 KRAS
## 2 -2.246003 KRAS
## 3 -1.845185 KRAS
## 4 -3.137764 KRAS
## 5 -2.391494 KRAS
## 6 -0.827603 KRAS
## [1] "NRAS..4893."
## NULL
## 
##  Non NRAS 
## 1138   67 
##       CRISPR  LoF
## 1 -0.0975696 NRAS
## 2 -1.7831998 NRAS
## 3 -1.6068707 NRAS
## 4 -1.0833982 NRAS
## 5 -1.2798583 NRAS
## 6 -1.3879226 NRAS
## [1] "MTOR..2475."
## NULL
## 
##    Non PIK3CA 
##   1062    143 
##       CRISPR    LoF
## 1 -0.8464532 PIK3CA
## 2 -1.1011345 PIK3CA
## 3 -0.8463668 PIK3CA
## 4 -1.1620927 PIK3CA
## 5 -1.4386328 PIK3CA
## 6 -1.0168104 PIK3CA
## [1] "MDM2..4193."
## NULL
## 
##  Non TP53 
##  386  819 
##        CRISPR  LoF
## 1 -0.31832140 TP53
## 2 -0.03748236 TP53
## 3 -0.14197657 TP53
## 4 -1.62768735 TP53
## 5 -0.25559085 TP53
## 6 -0.44362467 TP53
## [1] "CTNNB1..1499."
## NULL
## 
##  APC  Non 
##  142 1063 
##       CRISPR LoF
## 1 -0.9788010 APC
## 2 -1.3508856 APC
## 3 -0.4138809 APC
## 4 -0.2129251 APC
## 5 -0.9388554 APC
## 6 -0.2255311 APC
## [1] "CTNNB1..1499."
## NULL
## 
## CTNNB1    Non 
##     61   1144 
##      CRISPR    LoF
## 1 -1.024287 CTNNB1
## 2 -0.978801 CTNNB1
## 3 -1.076761 CTNNB1
## 4 -1.363486 CTNNB1
## 5 -1.360818 CTNNB1
## 6 -1.708011 CTNNB1
## [1] "ERBB2..2064."
## NULL
## 
## ERBB2   Non 
##    47  1158 
##       CRISPR   LoF
## 1 -0.1619255 ERBB2
## 2 -0.3871655 ERBB2
## 3 -0.2942801 ERBB2
## 4 -0.3821129 ERBB2
## 5 -0.2863837 ERBB2
## 6 -1.4108343 ERBB2
## [1] "ERBB4..2066."
## NULL
## 
## ERBB4   Non 
##    84  1121 
##        CRISPR   LoF
## 1 -0.21059930 ERBB4
## 2 -0.15385767 ERBB4
## 3 -0.04447189 ERBB4
## 4 -0.24311890 ERBB4
## 5 -0.18806732 ERBB4
## 6 -0.10381109 ERBB4
## [1] "EZH2..2146."
## NULL
## 
## EZH2  Non 
##   39 1166 
##       CRISPR  LoF
## 1 -0.1434961 EZH2
## 2 -0.1285251 EZH2
## 3 -1.5275773 EZH2
## 4 -1.0537890 EZH2
## 5 -0.7269203 EZH2
## 6 -0.9224066 EZH2
## [1] "FLT3..2322."
## NULL
## 
## FLT3  Non 
##   54 1151 
##         CRISPR  LoF
## 1  0.053693382 FLT3
## 2 -1.113222142 FLT3
## 3  0.003950539 FLT3
## 4 -1.990717193 FLT3
## 5 -0.024649067 FLT3
## 6  0.045250929 FLT3
## [1] "HRAS..3265."
## NULL
## 
## HRAS  Non 
##   20 1185 
##        CRISPR  LoF
## 1 -1.29103793 HRAS
## 2  0.04029183 HRAS
## 3 -1.80538658 HRAS
## 4 -0.56986289 HRAS
## 5 -0.14533754 HRAS
## 6 -0.27036669 HRAS
## [1] "IDH1..3417."
## NULL
## 
## IDH1  Non 
##   18 1187 
##         CRISPR  LoF
## 1 -0.379589070 IDH1
## 2 -0.109113210 IDH1
## 3 -0.001456304 IDH1
## 4  0.035921015 IDH1
## 5 -0.200207562 IDH1
## 6  0.130237240 IDH1
## [1] "KIT..3815."
## NULL
## 
##  KIT  Non 
##   47 1158 
##        CRISPR LoF
## 1 -0.11250366 KIT
## 2 -0.02121563 KIT
## 3 -0.11054184 KIT
## 4 -0.85855100 KIT
## 5 -0.14079311 KIT
## 6 -0.12120066 KIT
## [1] "MAP3K1..4214."
## NULL
## 
## MAP2K1    Non 
##     24   1181 
##        CRISPR    LoF
## 1 -0.13819325 MAP2K1
## 2 -0.10729144 MAP2K1
## 3 -0.07475951 MAP2K1
## 4 -0.20591925 MAP2K1
## 5 -0.12886647 MAP2K1
## 6 -0.09137880 MAP2K1
## [1] "MTOR..2475."
## NULL
## 
## MTOR  Non 
##   73 1132 
##      CRISPR  LoF
## 1 -1.583739 MTOR
## 2 -1.428384 MTOR
## 3 -1.077943 MTOR
## 4 -1.295943 MTOR
## 5 -1.514911 MTOR
## 6 -1.407657 MTOR
## [1] "SMARCA2..6595."
## NULL
## 
##     Non SMARCA4 
##    1057     148 
##       CRISPR     LoF
## 1 -0.1365668 SMARCA4
## 2  0.2152061 SMARCA4
## 3 -1.4339477 SMARCA4
## 4 -0.4926894 SMARCA4
## 5  0.1038026 SMARCA4
## 6 -0.6954682 SMARCA4
## [1] "PIK3CA..5290."
## NULL
## 
##  Non PTEN 
## 1064  141 
##        CRISPR  LoF
## 1 -0.33078062 PTEN
## 2 -0.47265240 PTEN
## 3 -0.50748704 PTEN
## 4 -0.06810845 PTEN
## 5 -0.34723434 PTEN
## 6 -0.23058936 PTEN
## [1] "EIF4E..1977."
## NULL
## 
##  Non TSC1 
## 1178   27 
##      CRISPR  LoF
## 1 -1.565571 TSC1
## 2 -1.381869 TSC1
## 3 -2.342957 TSC1
## 4 -1.702713 TSC1
## 5 -1.944665 TSC1
## 6 -1.493313 TSC1
mike_biomarkers_mutation_nona[, 'effect_size_23Q4'] = sapply(res, function(x) x[1])
mike_biomarkers_mutation_nona[, 'FDR_23Q4'] = sapply(res, function(x) x[2])
mike_biomarkers_mutation_nona[, 'Cor_23Q4'] = sapply(res, function(x) x[4])
mike_biomarkers_mutation_nona[, 'Corpvalue_23Q4'] = sapply(res, function(x) x[3])

mike_biomarkers_mutation_nona[, 'Significant_23Q4'] <- ifelse(mike_biomarkers_mutation_nona[, 'FDR_23Q4'] < 0.05 & mike_biomarkers_mutation_nona[, 'effect_size_23Q4'] > 0, "Positive FDR < 0.05", "Not Sig")
mike_biomarkers_mutation_nona[, 'Significant_23Q4'][mike_biomarkers_mutation_nona[, 'FDR_23Q4'] < 0.05 & mike_biomarkers_mutation_nona[, 'effect_size_23Q4'] < 0] <- "Negative FDR < 0.05"

mike_biomarkers_mutation_nona[, 'CRISPR_Mutation'] = paste0(mike_biomarkers_mutation_nona$Dep_Gene, "~", mike_biomarkers_mutation_nona$Feat_gene)

p1 = ggplot(mike_biomarkers_mutation_nona, aes(x = -log10(p.value), y = -log10(Corpvalue_23Q4))) +
  geom_point(size=5) + xlab("-log 10 Before 23Q2 correlation p value") + ylab("-log 10 23Q4 correlation p value") + geom_abline() + xlim(0, 250) + ylim(0, 250)
p2 =  ggplot(mike_biomarkers_mutation_nona, aes(x = cor, y = Cor_23Q4)) +
  geom_point(size=5) + xlab("Before 23Q2 Correlation") + ylab("23Q4 correlation") + geom_abline() + xlim(-1, 1) + ylim(-1, 1)

p1 / p2

ggplot(mike_biomarkers_mutation_nona, aes(x = effect_size_23Q4, y = -log10(FDR_23Q4))) +
  geom_point(aes(color = Significant_23Q4), size=6) +
  scale_color_manual(values = c("blue", "grey",  "red" )) +
  theme_bw(base_size = 20) + theme(legend.position = "bottom") +   xlim(-1.5, 1.5) +
  geom_text_repel(
    data = subset(mike_biomarkers_mutation_nona, FDR_23Q4 < 0.05),
    aes(label = CRISPR_Mutation),
    size = 6,
    box.padding = unit(1.5, "lines"),
    point.padding = unit(0.5, "lines")
  )
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Signature analysis and embedding

test_23Q4_ccle_maf.tnm = trinucleotideMatrix(maf = test_23Q4_ccle_maf, prefix = '', add = TRUE, ref_genome = "BSgenome.Hsapiens.UCSC.hg38")
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:NMF':
## 
##     nrun
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:lubridate':
## 
##     %within%
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
## 
##     compact
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## -Extracting 5' and 3' adjacent bases
## -Extracting +/- 20bp around mutated bases for background C>T estimation
## -Estimating APOBEC enrichment scores
## --Performing one-way Fisher's test for APOBEC enrichment
## ---APOBEC related mutations are enriched in  12.734 % of samples (APOBEC enrichment score > 2 ;  374  of  2937  samples)
## -Creating mutation matrix
## --matrix of dimension 2937x96
write.table(test_23Q4_ccle_maf.tnm$nmf_matrix, file="23Q4_nmf_maftools.txt", quote=F, sep='\t')
plotApobecDiff(tnm = test_23Q4_ccle_maf.tnm, maf = test_23Q4_ccle_maf, pVal = 0.2)
## --Possible FLAGS among top ten genes:
##   TTN
##   MUC16
##   OBSCN
## -Processing clinical data
## --Possible FLAGS among top ten genes:
##   TTN
##   MUC16
##   OBSCN
## -Processing clinical data
## Showing top 10 of 1624 differentially mutated genes

## $results
##        Hugo_Symbol Enriched nonEnriched         pval        or     ci.up
##     1:        TP53      288        1560 6.696458e-10 2.1525833 2.8082872
##     2:      PIK3CA       63         221 3.312864e-06 2.1460239 2.9284523
##     3:      CDKN2A       73         279 7.625498e-06 1.9848605 2.6539443
##     4:       TENM3       13         250 1.664036e-05 0.3332583 0.5886347
##     5:      JARID2        4         144 2.173181e-05 0.1816654 0.4797932
##    ---                                                                  
## 18789:      ZFAND6        1          13 1.000000e+00 0.5259730 3.5210390
## 18790:     ZMYND19        1          13 1.000000e+00 0.5259730 3.5210390
## 18791:     ZNF804A       24         168 1.000000e+00 0.9775562 1.5309304
## 18792:    ARHGEF28       15         106 1.000000e+00 0.9684993 1.6942178
## 18793:       RGSL1       15         106 1.000000e+00 0.9684993 1.6942178
##            ci.low      adjPval
##     1: 1.66308465 1.258465e-05
##     2: 1.55684095 3.112933e-02
##     3: 1.47183837 4.776866e-02
##     4: 0.17307258 6.696016e-02
##     5: 0.04854586 6.696016e-02
##    ---                        
## 18789: 0.01234483 1.000000e+00
## 18790: 0.01234483 1.000000e+00
## 18791: 0.60040903 1.000000e+00
## 18792: 0.51769651 1.000000e+00
## 18793: 0.51769651 1.000000e+00
## 
## $SampleSummary
##         Cohort SampleSize    Mean Median
## 1:    Enriched        374 294.457  233.5
## 2: nonEnriched       2563 369.921  175.0

Extract signatures

#test_23Q4_ccle_maf.tnm.sig = extractSignatures(mat = test_23Q4_ccle_maf.tnm, n = 60)

test_23Q4_ccle_maf.tnm.sig = extractSignatures(mat = test_23Q4_ccle_maf.tnm, n = 60)
## -Running NMF for factorization rank: 60
## -Finished in00:03:35 elapsed (00:02:50 cpu)

Mutation embedding for cell lines

NMF_23Q4_cellline = t(test_23Q4_ccle_maf.tnm.sig$nmfObj@fit@H)

library(umap)
NMF_23Q4_cellline.umap = umap(NMF_23Q4_cellline)

colnames(NMF_23Q4_cellline.umap$layout) = c("Dim1", "Dim2")

library(reshape2)

NMF_umap_plot.data = data.frame(NMF_23Q4_cellline.umap$layout)

NMF_umap_plot.data$Celltype = metadata[match(rownames(NMF_umap_plot.data), metadata$SequencingID), 'Lineage']

ggplot(NMF_umap_plot.data)+geom_point(aes(x=Dim1, y=Dim2, colour=Celltype))

test_23Q4_ccle_maf.tnm.sig.cosm = compareSignatures(nmfRes = test_23Q4_ccle_maf.tnm.sig, sig_db = "SBS")
## -Comparing against COSMIC signatures
## ------------------------------------
## --Found Signature_1 most similar to SBS52
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.939]
## --Found Signature_2 most similar to SBS31
##    Aetiology: Prior chemotherapy treatment with platinum drugs [cosine-similarity: 0.847]
## --Found Signature_3 most similar to SBS60
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.945]
## --Found Signature_4 most similar to SBS32
##    Aetiology: Prior treatment with azathioprine to induce immunosuppression [cosine-similarity: 0.601]
## --Found Signature_5 most similar to SBS7d
##    Aetiology: UV exposure [cosine-similarity: 0.901]
## --Found Signature_6 most similar to SBS41
##    Aetiology: Unknown [cosine-similarity: 0.129]
## --Found Signature_7 most similar to SBS25
##    Aetiology: Possible exposure to chemotherapy [cosine-similarity: 0.621]
## --Found Signature_8 most similar to SBS21
##    Aetiology: DNA mismatch repair deficiency [cosine-similarity: 0.812]
## --Found Signature_9 most similar to SBS17b
##    Aetiology: Unknown [cosine-similarity: 0.952]
## --Found Signature_10 most similar to SBS36
##    Aetiology: Defective base excision repair due to biallelic germline or somatic MUTYH mutations [cosine-similarity: 0.26]
## --Found Signature_11 most similar to SBS10b
##    Aetiology: Polymerase epsilon exonuclease domain mutations [cosine-similarity: 0.917]
## --Found Signature_12 most similar to SBS19
##    Aetiology: Unknown [cosine-similarity: 0.529]
## --Found Signature_13 most similar to SBS13
##    Aetiology: APOBEC Cytidine Deaminase (C>G) [cosine-similarity: 0.973]
## --Found Signature_14 most similar to SBS8
##    Aetiology: Unknown [cosine-similarity: 0.139]
## --Found Signature_15 most similar to SBS53
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.926]
## --Found Signature_16 most similar to SBS33
##    Aetiology: Unknown [cosine-similarity: 0.892]
## --Found Signature_17 most similar to SBS24
##    Aetiology: exposures to aflatoxin [cosine-similarity: 0.624]
## --Found Signature_18 most similar to SBS9
##    Aetiology: defects in polymerase-eta [cosine-similarity: 0.259]
## --Found Signature_19 most similar to SBS50
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.505]
## --Found Signature_20 most similar to SBS32
##    Aetiology: Prior treatment with azathioprine to induce immunosuppression [cosine-similarity: 0.486]
## --Found Signature_21 most similar to SBS44
##    Aetiology: Defective DNA mismatch repair [cosine-similarity: 0.408]
## --Found Signature_22 most similar to SBS46
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.353]
## --Found Signature_23 most similar to SBS1
##    Aetiology: spontaneous or enzymatic deamination of 5-methylcytosine [cosine-similarity: 0.757]
## --Found Signature_24 most similar to SBS39
##    Aetiology: Unknown [cosine-similarity: 0.733]
## --Found Signature_25 most similar to SBS46
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.27]
## --Found Signature_26 most similar to SBS55
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.309]
## --Found Signature_27 most similar to SBS42
##    Aetiology: Occupational exposure to haloalkanes [cosine-similarity: 0.69]
## --Found Signature_28 most similar to SBS55
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.375]
## --Found Signature_29 most similar to SBS16
##    Aetiology: Unknown [cosine-similarity: 0.82]
## --Found Signature_30 most similar to SBS51
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.605]
## --Found Signature_31 most similar to SBS24
##    Aetiology: exposures to aflatoxin [cosine-similarity: 0.322]
## --Found Signature_32 most similar to SBS54
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.621]
## --Found Signature_33 most similar to SBS44
##    Aetiology: Defective DNA mismatch repair [cosine-similarity: 0.433]
## --Found Signature_34 most similar to SBS4
##    Aetiology: exposure to tobacco (smoking) mutagens [cosine-similarity: 0.429]
## --Found Signature_35 most similar to SBS50
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.331]
## --Found Signature_36 most similar to SBS47
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.33]
## --Found Signature_37 most similar to SBS7c
##    Aetiology: UV exposure [cosine-similarity: 0.896]
## --Found Signature_38 most similar to SBS28
##    Aetiology: Unknown [cosine-similarity: 0.973]
## --Found Signature_39 most similar to SBS48
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.966]
## --Found Signature_40 most similar to SBS84
##    Aetiology: Activity of activation-induced cytidine deaminase (AID) [cosine-similarity: 0.28]
## --Found Signature_41 most similar to SBS22
##    Aetiology: exposure to aristolochic acid [cosine-similarity: 0.817]
## --Found Signature_42 most similar to SBS22
##    Aetiology: exposure to aristolochic acid [cosine-similarity: 0.201]
## --Found Signature_43 most similar to SBS39
##    Aetiology: Unknown [cosine-similarity: 0.095]
## --Found Signature_44 most similar to SBS16
##    Aetiology: Unknown [cosine-similarity: 0.454]
## --Found Signature_45 most similar to SBS7a
##    Aetiology: UV exposure [cosine-similarity: 0.754]
## --Found Signature_46 most similar to SBS39
##    Aetiology: Unknown [cosine-similarity: 0.179]
## --Found Signature_47 most similar to SBS49
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.904]
## --Found Signature_48 most similar to SBS84
##    Aetiology: Activity of activation-induced cytidine deaminase (AID) [cosine-similarity: 0.673]
## --Found Signature_49 most similar to SBS34
##    Aetiology: Unknown [cosine-similarity: 0.399]
## --Found Signature_50 most similar to SBS17a
##    Aetiology: Unknown [cosine-similarity: 0.965]
## --Found Signature_51 most similar to SBS53
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.233]
## --Found Signature_52 most similar to SBS18
##    Aetiology: Possibly damage by reactive oxygen species [cosine-similarity: 0.48]
## --Found Signature_53 most similar to SBS54
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.643]
## --Found Signature_54 most similar to SBS20
##    Aetiology: Concurrent POLD1 mutations and defective DNA mismatch repair [cosine-similarity: 0.85]
## --Found Signature_55 most similar to SBS2
##    Aetiology: APOBEC Cytidine Deaminase (C>T) [cosine-similarity: 0.86]
## --Found Signature_56 most similar to SBS58
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.747]
## --Found Signature_57 most similar to SBS59
##    Aetiology: Possible sequencing artefact [cosine-similarity: 0.946]
## --Found Signature_58 most similar to SBS10a
##    Aetiology: Polymerase epsilon exonuclease domain mutations [cosine-similarity: 0.987]
## --Found Signature_59 most similar to SBS1
##    Aetiology: spontaneous or enzymatic deamination of 5-methylcytosine [cosine-similarity: 0.407]
## --Found Signature_60 most similar to SBS15
##    Aetiology: Defective DNA mismatch repair [cosine-similarity: 0.811]
## ------------------------------------
pheatmap(mat = test_23Q4_ccle_maf.tnm.sig.cosm$cosine_similarities, cluster_rows = FALSE, main = "cosine similarity against validated signatures", width=20)

sig.cosine.vector = sapply(test_23Q4_ccle_maf.tnm.sig.cosm$best_match, function(x) x$best_match)
sig.cosine.vector = as.numeric(gsub("Best match: SBS.* \\[cosine-similarity: ([0-9].[0-9]*)\\]", "\\1", sig.cosine.vector))

names(sig.cosine.vector) = sapply(test_23Q4_ccle_maf.tnm.sig.cosm$best_match, function(x) x$aetiology)

sig.cosine.df = data.frame(rank=rank(sig.cosine.vector), cosine_similarity=sig.cosine.vector, aetiology=sapply(test_23Q4_ccle_maf.tnm.sig.cosm$best_match, function(x) x$aetiology))
sig.cosine.df = sig.cosine.df %>%
  group_by(aetiology) %>%
  summarise(Cosine=mean(cosine_similarity, na.rm=T)) %>%
  arrange(desc(Cosine))
sig.cosine.df$rank = rank(sig.cosine.df$Cosine)

# in total 60 NMF signatures, aggregates to 21 by mean cosine similarity

ggplot(sig.cosine.df, aes(x=rank, y=Cosine, colour=aetiology))+geom_point()+
  geom_label_repel(aes(label = aetiology,
                    fill = factor(aetiology)), color = 'white',
                    size = 3.5) +
   theme(legend.position = "none", panel.spacing.x = unit(0, "mm"), text = element_text(size=16))

Signature frequency by arbitrary cutoffs

hist(c(test_23Q4_ccle_maf.tnm.sig$contributions))

sig.cutoff = 0.01
sig.freq.samples = t(test_23Q4_ccle_maf.tnm.sig$contributions>sig.cutoff)

sig.freq.samples.lineages = metadata[match(rownames(sig.freq.samples), metadata$SequencingID), 'Lineage']

lineages <- c()
sig.freq.list <- list()
for (lineage in unique(sig.freq.samples.lineages)) {
  if (length(dim(sig.freq.samples[sig.freq.samples.lineages==lineage,]))!=0) {
    sig.freq.samples.each = sig.freq.samples[sig.freq.samples.lineages==lineage,]
    print(dim(sig.freq.samples.each))
    sig.freq.list[[lineage]] <- colSums(sig.freq.samples.each)
    lineages <- c(lineage, lineages)
  }
}
## [1] 47 60
## [1] 178  60
## [1] 97 60
## [1] 77 60
## [1] 413  60
## [1] 130  60
## [1] 28 60
## [1] 52 60
## [1] 63 60
## [1] 191  60
## [1] 47 60
## [1] 115  60
## [1] 111  60
## [1] 193  60
## [1] 211  60
## [1] 96 60
## [1] 70 60
## [1] 136  60
## [1] 129  60
## [1] 173  60
## [1] 34 60
## [1] 41 60
## [1] 97 60
## [1] 88 60
## [1] 14 60
## [1] 30 60
## [1] 25 60
## [1]  4 60
## [1] 28 60
## [1]  6 60
## [1]  3 60
## [1]  5 60
## [1]  2 60
sig.freq.mat = Reduce(rbind, sig.freq.list)
rownames(sig.freq.mat) = lineages
colnames(sig.freq.mat) = sapply(test_23Q4_ccle_maf.tnm.sig.cosm$best_match, function(x) x$aetiology)

pheatmap(mat = sig.freq.mat, cluster_rows = FALSE, cluster_cols = FALSE, main = "Mutation signatures in different lineages", width=20, color = colorRampPalette(c("white", "red"))(30))

Lung lineage signature

LUAD_maf.tnm = trinucleotideMatrix(maf = LUAD_maf, prefix = '', add = TRUE, ref_genome = "BSgenome.Hsapiens.UCSC.hg38")
## -Extracting 5' and 3' adjacent bases
## -Extracting +/- 20bp around mutated bases for background C>T estimation
## -Estimating APOBEC enrichment scores
## --Performing one-way Fisher's test for APOBEC enrichment
## ---APOBEC related mutations are enriched in  24.59 % of samples (APOBEC enrichment score > 2 ;  30  of  122  samples)
## -Creating mutation matrix
## --matrix of dimension 122x96
#LUAD_maf.sign = estimateSignatures(mat = LUAD_maf.tnm, nTry = 5)

#plotCophenetic(res = LUAD_maf.sign)

LUAD_maf.tnm.sig = extractSignatures(mat = LUAD_maf.tnm, n = 2)
## -Running NMF for factorization rank: 2
## -Finished in3.271s elapsed (2.493s cpu)
plotSignatures(nmfRes = LUAD_maf.tnm.sig, title_size = 1.5,
               ig_db = "SBS", axis_lwd=2)
## Warning in plot.window(xlim, ylim, log = log, ...): "ig_db" is not a graphical
## parameter
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "ig_db" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "ig_db" is not a graphical parameter
## Warning in axis(if (horiz) 1 else 2, cex.axis = cex.axis, ...): "ig_db" is not
## a graphical parameter
## Warning in plot.window(xlim, ylim, log = log, ...): "ig_db" is not a graphical
## parameter
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "ig_db" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "ig_db" is not a graphical parameter
## Warning in axis(if (horiz) 1 else 2, cex.axis = cex.axis, ...): "ig_db" is not
## a graphical parameter

Skin lineage signature

SKCM_maf.tnm = trinucleotideMatrix(maf = SKCM_maf, prefix = '', add = TRUE, ref_genome = "BSgenome.Hsapiens.UCSC.hg38")
## -Extracting 5' and 3' adjacent bases
## -Extracting +/- 20bp around mutated bases for background C>T estimation
## -Estimating APOBEC enrichment scores
## --Performing one-way Fisher's test for APOBEC enrichment
## ---APOBEC related mutations are enriched in  0 % of samples (APOBEC enrichment score > 2 ;  0  of  24  samples)
## -Creating mutation matrix
## --matrix of dimension 24x96
#SKCM_maf.sign = estimateSignatures(mat = SKCM_maf.tnm, nTry = 5)

#plotCophenetic(res = SKCM_maf.sign)

SKCM_maf.tnm.sig = extractSignatures(mat = SKCM_maf.tnm, n = 2)
## -Running NMF for factorization rank: 2
## -Finished in1.154s elapsed (1.117s cpu)
plotSignatures(nmfRes = SKCM_maf.tnm.sig, title_size = 1.5, ig_db = "SBS", axis_lwd=2)
## Warning in plot.window(xlim, ylim, log = log, ...): "ig_db" is not a graphical
## parameter
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "ig_db" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "ig_db" is not a graphical parameter
## Warning in axis(if (horiz) 1 else 2, cex.axis = cex.axis, ...): "ig_db" is not
## a graphical parameter
## Warning in plot.window(xlim, ylim, log = log, ...): "ig_db" is not a graphical
## parameter
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "ig_db" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "ig_db" is not a graphical parameter
## Warning in axis(if (horiz) 1 else 2, cex.axis = cex.axis, ...): "ig_db" is not
## a graphical parameter